# 组合方案：pdfplumber + pytesseract
import pdfplumber
import pytesseract
from PIL import Image


def _ocr_image_region(page, img, resolution):
    """OCR the page area covered by one embedded image.

    ``img`` is one entry of ``page.images``. The area is rendered from the
    page instead of being decoded from ``img["stream"]``: that value is a PDF
    stream object rather than a file-like object, so ``PIL.Image.open``
    cannot read it directly.

    Returns the recognized text, or ``None`` when the image does not overlap
    the page at all.
    """
    page_x0, page_top, page_x1, page_bottom = page.bbox
    # Clamp to the page: crop() rejects boxes that extend past the page bounds.
    x0 = max(img["x0"], page_x0)
    top = max(img["top"], page_top)
    x1 = min(img["x1"], page_x1)
    bottom = min(img["bottom"], page_bottom)
    if x0 >= x1 or top >= bottom:
        return None
    rendered = page.crop((x0, top, x1, bottom)).to_image(resolution=resolution)
    return pytesseract.image_to_string(rendered.original, config='--psm 6')


def convert_complex_pdf(
    pdf_path, txt_path, *, ocr_resolution: int = 300
) -> None:
    """Convert a PDF containing text, images and tables into a UTF-8 text file.

    For every page, in order, the output receives:

    1. the page's extracted text, if any;
    2. one ``[FORMULA_START]...[FORMULA_END]`` block per embedded image,
       holding the OCR result for that image's area;
    3. one ``[TABLE_START]...[TABLE_END]`` block per detected table, with
       cells separated by tabs and rows by newlines.

    Args:
        pdf_path: Path of the PDF to read (passed to ``pdfplumber.open``).
        txt_path: Path of the text file to write; overwritten if it exists.
        ocr_resolution: Resolution (DPI) used when rendering image areas
            for OCR.
    """
    text_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract once; the call re-runs the layout analysis each time.
            page_text = page.extract_text()
            if page_text:
                text_blocks.append(page_text)

            # Embedded images are treated as formulas and run through OCR.
            for img in page.images:
                formula = _ocr_image_region(page, img, ocr_resolution)
                if formula is not None:
                    text_blocks.append(
                        f"\n[FORMULA_START]{formula}[FORMULA_END]\n"
                    )

            for table in page.extract_tables():
                # Empty cells come back as None, which str.join rejects.
                table_text = "\n".join(
                    "\t".join(cell or "" for cell in row) for row in table
                )
                text_blocks.append(
                    f"\n[TABLE_START]\n{table_text}\n[TABLE_END]\n"
                )

    # The PDF is no longer needed once every page has been processed.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write("\n".join(text_blocks))


# Script entry: convert the sample PDF into a text file with the same base name.
convert_complex_pdf(
    pdf_path="髋关节置换术后定向肌群引导训练_华莉.pdf",
    txt_path="髋关节置换术后定向肌群引导训练_华莉.txt",
)
